## this code chunk fits the clustering pipelines on the training data
#### choose imputation methods ####
data_ls <- list(
"Mean-imputed" = data_mean_imputed$train,
"RF-imputed" = data_rf_imputed$train
)
#### choose number of features ####
feature_modes <- list(
"Small" = 7,
"Medium" = 11,
"Big" = 19
)
#### choose ks ####
ks <- 2:30
#### choose dimension reduction methods ####
# raw data
identity_fun_ls <- list("Raw" = function(x) x)
# pca
pca_fun_ls <- list("PCA" = purrr::partial(fit_pca, ndim = 4))
# tsne
tsne_perplexities <- c(30, 100)
tsne_fun_ls <- purrr::map(
tsne_perplexities,
~ purrr::partial(fit_tsne, dims = 2, perplexity = .x)
) |>
setNames(sprintf("tSNE (perplexity = %d)", tsne_perplexities))
# putting it together
dr_fun_ls <- c(
identity_fun_ls,
pca_fun_ls,
tsne_fun_ls
)
#### choose clustering methods ####
# kmeans
kmeans_fun_ls <- list("K-means" = purrr::partial(fit_kmeans, ks = ks))
# hierarchical clustering
hclust_params <- list(
d = c("euclidean"),
linkage = c("complete", "ward.D")
) |>
expand.grid()
hclust_fun_ls <- purrr::map(
1:nrow(hclust_params),
~ purrr::partial(
fit_hclust,
ks = ks,
d = hclust_params$d[[.x]],
linkage = hclust_params$linkage[[.x]]
)
) |>
setNames(
sprintf(
"Hierarchical (dist = %s, link = %s)",
hclust_params$d, hclust_params$linkage
)
)
# spectral clustering
n_neighbors <- c(5, 30, 60, 100)
spectral_fun_ls <- purrr::map(
n_neighbors,
~ purrr::partial(
fit_spectral_clustering,
ks = ks,
affinity = "nearest_neighbors",
n_neighbors = .x
)
) |>
setNames(sprintf("Spectral (n_neighbors = %s)", n_neighbors))
# putting it together
clust_fun_ls <- c(
kmeans_fun_ls,
hclust_fun_ls,
spectral_fun_ls
)
#### Fit Clustering Pipelines ####
pipe_tib <- tidyr::expand_grid(
data = data_ls,
feature_mode = feature_modes,
dr_method = dr_fun_ls,
clust_method = clust_fun_ls
) |>
dplyr::mutate(
impute_mode_name = names(data),
feature_mode_name = names(feature_mode),
dr_method_name = names(dr_method),
clust_method_name = names(clust_method),
name = stringr::str_glue(
"{clust_method_name} [{impute_mode_name} + {feature_mode_name} + {dr_method_name}]"
)
) |>
# remove some clustering pipelines to reduce computation burden
dplyr::filter(
# remove all big feature set + dimension-reduction runs
!((dr_method_name != "Raw") & (feature_mode_name == "Big"))
)
pipe_ls <- split(pipe_tib, seq_len(nrow(pipe_tib))) |>
setNames(pipe_tib$name)
fit_results_fname <- file.path(RESULTS_PATH, "clustering_fits_train.rds")
if (!file.exists(fit_results_fname)) {
library(future)
plan(multisession, workers = NCORES)
# fit clustering pipelines (if not already cached)
clust_fit_ls <- furrr::future_map(
pipe_ls,
function(pipe_df) {
g <- create_preprocessing_pipeline(
feature_mode = pipe_df$feature_mode[[1]],
preprocess_fun = pipe_df$dr_method[[1]]
)
clust_out <- pipe_df$clust_method[[1]](
data = pipe_df$data[[1]], preprocess_fun = g
)
return(clust_out)
},
.options = furrr::furrr_options(
seed = TRUE,
globals = list(
ks = ks,
create_preprocessing_pipeline = create_preprocessing_pipeline,
get_abundance_data = get_abundance_data,
tsne_perplexities = tsne_perplexities,
hclust_params = hclust_params,
n_neighbors = n_neighbors,
fit_kmeans = fit_kmeans,
fit_hclust = fit_hclust,
fit_spectral_clustering = fit_spectral_clustering
)
)
)
# save fitted clustering pipelines
saveRDS(clust_fit_ls, file = fit_results_fname)
} else {
# read in fitted clustering pipelines (if already cached)
clust_fit_ls <- readRDS(fit_results_fname)
}